In [4]:
!pip install plotly
Requirement already satisfied: plotly in c:\users\mainuddin\anaconda3\lib\site-packages (5.9.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\mainuddin\anaconda3\lib\site-packages (from plotly) (8.0.1)
WARNING: There was an error checking the latest version of pip.
In [ ]:
 
In [1]:
import pandas as pd
import datetime
from datetime import date,timedelta

import plotly.graph_objects as go                  # Slicer
import plotly.express as px                        #Visualization
import plotly.io as pio                            #template

pio.templates.default="plotly_white"
In [2]:
#1)	Read the data and display the first 100 rows from the data
data=pd.read_excel("twtr.xlsx")
print(data.head(100))
         Date       Open       High        Low      Close  Adj Close  \
0  2013-11-07  45.099998  50.090000  44.000000  44.900002  44.900002   
1  2013-11-08  45.930000  46.939999  40.685001  41.650002  41.650002   
2  2013-11-11  40.500000  43.000000  39.400002  42.900002  42.900002   
3  2013-11-12  43.660000  43.779999  41.830002  41.900002  41.900002   
4  2013-11-13  41.029999  42.869999  40.759998  42.599998  42.599998   
..        ...        ...        ...        ...        ...        ...   
95 2014-03-27  45.090000  46.400002  43.310001  46.320000  46.320000   
96 2014-03-28  46.650002  47.340000  45.700001  47.299999  47.299999   
97 2014-03-31  47.549999  47.750000  46.430000  46.669998  46.669998   
98 2014-04-01  46.709999  47.590000  46.180000  46.980000  46.980000   
99 2014-04-02  47.400002  47.439999  45.509998  45.730000  45.730000   

         Volume  
0   117701670.0  
1    27925307.0  
2    16113941.0  
3     6316755.0  
4     8688325.0  
..          ...  
95   15507597.0  
96    9610491.0  
97    5794497.0  
98    6916147.0  
99    7911260.0  

[100 rows x 7 columns]
In [3]:
#Give the column insights 
print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264 entries, 0 to 2263
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       2264 non-null   datetime64[ns]
 1   Open       2259 non-null   float64       
 2   High       2259 non-null   float64       
 3   Low        2259 non-null   float64       
 4   Close      2259 non-null   float64       
 5   Adj Close  2259 non-null   float64       
 6   Volume     2259 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 123.9 KB
None
In [4]:
# 3) Check whether this dataset contains any null values or not 

print(data.isnull().sum())
Date         0
Open         5
High         5
Low          5
Close        5
Adj Close    5
Volume       5
dtype: int64
In [5]:
# if it is there then remove the null values from it 

data=data.dropna()
print(data.head(100))
         Date       Open       High        Low      Close  Adj Close  \
0  2013-11-07  45.099998  50.090000  44.000000  44.900002  44.900002   
1  2013-11-08  45.930000  46.939999  40.685001  41.650002  41.650002   
2  2013-11-11  40.500000  43.000000  39.400002  42.900002  42.900002   
3  2013-11-12  43.660000  43.779999  41.830002  41.900002  41.900002   
4  2013-11-13  41.029999  42.869999  40.759998  42.599998  42.599998   
..        ...        ...        ...        ...        ...        ...   
95 2014-03-27  45.090000  46.400002  43.310001  46.320000  46.320000   
96 2014-03-28  46.650002  47.340000  45.700001  47.299999  47.299999   
97 2014-03-31  47.549999  47.750000  46.430000  46.669998  46.669998   
98 2014-04-01  46.709999  47.590000  46.180000  46.980000  46.980000   
99 2014-04-02  47.400002  47.439999  45.509998  45.730000  45.730000   

         Volume  
0   117701670.0  
1    27925307.0  
2    16113941.0  
3     6316755.0  
4     8688325.0  
..          ...  
95   15507597.0  
96    9610491.0  
97    5794497.0  
98    6916147.0  
99    7911260.0  

[100 rows x 7 columns]
In [6]:
    # 4)	Find the statistical description of the data.

print(data.describe())
              Open         High          Low        Close    Adj Close  \
count  2259.000000  2259.000000  2259.000000  2259.000000  2259.000000   
mean     36.020286    36.699881    35.339465    36.003625    36.003625   
std      14.118463    14.372057    13.828724    14.089989    14.089989   
min      13.950000    14.220000    13.725000    14.010000    14.010000   
25%      25.550000    26.215001    24.912501    25.410000    25.410000   
50%      35.419998    36.099998    34.820000    35.490002    35.490002   
75%      44.205000    45.015000    43.327501    44.135000    44.135000   
max      78.360001    80.750000    76.050003    77.629997    77.629997   

             Volume  
count  2.259000e+03  
mean   2.175186e+07  
std    1.909988e+07  
min    0.000000e+00  
25%    1.233530e+07  
50%    1.691305e+07  
75%    2.428082e+07  
max    2.692131e+08  
In [7]:
# 4)	Find the statistical description of the data.

print(data.isnull())
       Date   Open   High    Low  Close  Adj Close  Volume
0     False  False  False  False  False      False   False
1     False  False  False  False  False      False   False
2     False  False  False  False  False      False   False
3     False  False  False  False  False      False   False
4     False  False  False  False  False      False   False
...     ...    ...    ...    ...    ...        ...     ...
2254  False  False  False  False  False      False   False
2255  False  False  False  False  False      False   False
2256  False  False  False  False  False      False   False
2257  False  False  False  False  False      False   False
2258  False  False  False  False  False      False   False

[2259 rows x 7 columns]
In [ ]:
 
In [8]:
# 6) Give me the Z-test O/R T-test over High, low, and close columns and see if the null 
#    hypothesis gets rejected or accepted

# program for z test (for High)


import statistics as st
from statsmodels.stats import weightstats as stest
from numpy import random

high=data['High']
print(high)

high_mean = st.mean(high)
print('Mean Data = ',high_mean)

high_stdv = st.stdev(high)
print('Standard daviation = ',high_stdv)

ztest,pval = stest.ztest(high,value=50)
print('Z test score ',ztest)
print('p value ',pval)

if pval<0.05:
    print('Reject Null hypothesis')
else:
    print('Accept Null hypothesis')
0       50.090000
1       46.939999
2       43.000000
3       43.779999
4       42.869999
          ...    
2254    50.750000
2255    51.860001
2256    53.180000
2257    53.500000
2258    54.000000
Name: High, Length: 2259, dtype: float64
Mean Data =  36.69988069278442
Standard daviation =  14.372056692309659
Z test score  -43.984000817502285
p value  0.0
Reject Null hypothesis
In [9]:
# program for z test (for Low)

low=data['Low']

low_mean=st.mean(low)
print('Mean = ',low_mean)

low_stdv=st.stdev(low)
print('Standard Daviation = ',low_stdv)

ztest,pval=stest.ztest(low,value=30)
print('Z test score = ',ztest)
print('p-value = ',pval)

if pval<0.05:
    print('Reject null hypothesis')
else:
    print('Accept null hypothesis')
    
Mean =  35.339464800354136
Standard Daviation =  13.828723572649182
Z test score =  18.35159210972168
p-value =  3.205795428365976e-75
Reject null hypothesis
In [10]:
# program for z test (for Close)

close=data['Close']

close_mean=st.mean(close)
print('Mean = ',close_mean)

close_stdv=st.stdev(close)
print('Standard Daviation ',close_stdv)

ztest,pval=stest.ztest(close,value=30)
print('Z test score = ',ztest)
print('p-value = ',pval)

if pval<0.05:
    print('Reject Null hypothesis')
else:
    print('accept null hypothesis')
Mean =  36.00362549048251
Standard Daviation  14.08998893401524
Z test score =  20.251679995652125
p-value =  3.433291673809823e-91
Reject Null hypothesis
In [11]:
# program for t test   ( for High)

import statistics as st
from scipy.stats import ttest_1samp

high=data['High']

high_mean=st.mean(high)
print('Mean = ',high_mean)

high_stdv=st.stdev(high)
print('Standard Daviaion = ',high_stdv)

t_test,pval=ttest_1samp(high, 30)
print('t test score = ',t_test)
print('p-value = ',pval)

if pval<0.05:
    print('reject null hypothesis')
else:
    print('Accept null hypothesis')
Mean =  36.69988069278442
Standard Daviaion =  14.372056692309659
t test score =  22.15676048174423
p-value =  1.3745989635531811e-98
reject null hypothesis
In [12]:
# 7)By using ANOVA find the Fvalue and Pvalue from the data and see its Acceptance and rejection of the Null hypothesis.

import pandas as pd 
import scipy.stats
import io 

data=pd.read_excel("twtr.xlsx")

print(data.head(5))

data=data.dropna()

grp1=data['High']
grp2=data['Low']
print(grp1.head(5))
print(grp2.head(5))

F,pval =scipy.stats.f_oneway(grp1,grp2)
print('F value ' ,F)
print('p-value = ',pval)

if pval<0.05:
    print('reject Null hypothesis')
else:
    print('accept Null Hypothesis')
        Date       Open       High        Low      Close  Adj Close  \
0 2013-11-07  45.099998  50.090000  44.000000  44.900002  44.900002   
1 2013-11-08  45.930000  46.939999  40.685001  41.650002  41.650002   
2 2013-11-11  40.500000  43.000000  39.400002  42.900002  42.900002   
3 2013-11-12  43.660000  43.779999  41.830002  41.900002  41.900002   
4 2013-11-13  41.029999  42.869999  40.759998  42.599998  42.599998   

        Volume  
0  117701670.0  
1   27925307.0  
2   16113941.0  
3    6316755.0  
4    8688325.0  
0    50.090000
1    46.939999
2    43.000000
3    43.779999
4    42.869999
Name: High, dtype: float64
0    44.000000
1    40.685001
2    39.400002
3    41.830002
4    40.759998
Name: Low, dtype: float64
F value  10.510084066032649
p-value =  0.0011958722702637046
reject Null hypothesis
In [13]:
# 8)	Check if the data is dependent or independent by using the chi-square method.

from scipy import stats
datas={'High':data['High'],'Low':data['Low']}
print(datas)

chisq,pval=scipy.stats.chisquare(datas['Low'])

alpha=0.05
print('Chi-square value = ',chisq)
print('p-value = ',pval)

if pval<alpha:
    print('Dependent (reject Ho)')
else:
    print('Independnt (Accept Ho)')
{'High': 0       50.090000
1       46.939999
2       43.000000
3       43.779999
4       42.869999
          ...    
2254    50.750000
2255    51.860001
2256    53.180000
2257    53.500000
2258    54.000000
Name: High, Length: 2259, dtype: float64, 'Low': 0       44.000000
1       40.685001
2       39.400002
3       41.830002
4       40.759998
          ...    
2254    49.549999
2255    50.520000
2256    52.200001
2257    52.770000
2258    53.700001
Name: Low, Length: 2259, dtype: float64}
Chi-square value =  12218.788864355833
p-value =  0.0
Dependent (reject Ho)
In [14]:
# Outliers
# find the anomalies or any
# outliers in the data or not.

from scipy import stats
import pandas as pd
import numpy as np

mu,sigma=100,5

array=np.random.normal(mu,sigma,200)

array[90] = 180
array[50] = -40

df=pd.DataFrame(array,columns=['Data'])
print(df)

z=np.abs(stats.zscore(df))
print(z)

print('No of Outliers= ',df[z>3].count())
print('Outliers are: ',df[(z>3)|(z<3)])
df_new=df[(z>-3)&(z<3)]

print(df_new)
print(df.shape)
print(df_new.shape)
           Data
0     98.907696
1     99.774472
2    101.983482
3     99.685256
4    105.027707
..          ...
195  107.481660
196  106.674779
197   87.443957
198   92.601350
199   93.575813

[200 rows x 1 columns]
         Data
0    0.050156
1    0.019840
2    0.198228
3    0.012636
4    0.444063
..        ...
195  0.642230
196  0.577071
197  0.975904
198  0.559422
199  0.480729

[200 rows x 1 columns]
No of Outliers=  Data    2
dtype: int64
Outliers are:             Data
0     98.907696
1     99.774472
2    101.983482
3     99.685256
4    105.027707
..          ...
195  107.481660
196  106.674779
197   87.443957
198   92.601350
199   93.575813

[200 rows x 1 columns]
           Data
0     98.907696
1     99.774472
2    101.983482
3     99.685256
4    105.027707
..          ...
195  107.481660
196  106.674779
197   87.443957
198   92.601350
199   93.575813

[200 rows x 1 columns]
(200, 1)
(200, 1)
In [15]:
data=pd.read_excel('twtr.xlsx')
print(data.isnull().sum())
Date         0
Open         5
High         5
Low          5
Close        5
Adj Close    5
Volume       5
dtype: int64
In [16]:
data=data.dropna()
In [17]:
print(data.isnull().sum())
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64
In [18]:
print(data.head(50))
         Date       Open       High        Low      Close  Adj Close  \
0  2013-11-07  45.099998  50.090000  44.000000  44.900002  44.900002   
1  2013-11-08  45.930000  46.939999  40.685001  41.650002  41.650002   
2  2013-11-11  40.500000  43.000000  39.400002  42.900002  42.900002   
3  2013-11-12  43.660000  43.779999  41.830002  41.900002  41.900002   
4  2013-11-13  41.029999  42.869999  40.759998  42.599998  42.599998   
5  2013-11-14  42.340000  45.669998  42.240002  44.689999  44.689999   
6  2013-11-15  45.250000  45.270000  43.430000  43.980000  43.980000   
7  2013-11-18  43.500000  43.950001  40.849998  41.139999  41.139999   
8  2013-11-19  41.389999  41.900002  40.000000  41.750000  41.750000   
9  2013-11-20  41.400002  41.750000  40.509998  41.049999  41.049999   
10 2013-11-21  41.250000  42.490002  40.369999  42.060001  42.060001   
11 2013-11-22  41.810001  42.279999  40.970001  41.000000  41.000000   
12 2013-11-25  41.080002  41.139999  38.799999  39.060001  39.060001   
13 2013-11-26  39.160000  40.549999  38.919998  40.180000  40.180000   
14 2013-11-27  40.470001  41.400002  40.349998  40.900002  40.900002   
15 2013-11-29  41.400002  41.580002  40.900002  41.570000  41.570000   
16 2013-12-02  41.790001  42.000000  40.400002  40.779999  40.779999   
17 2013-12-03  40.689999  41.599998  40.540001  41.369999  41.369999   
18 2013-12-04  41.270000  43.919998  41.270000  43.689999  43.689999   
19 2013-12-05  43.450001  46.349998  42.830002  45.619999  45.619999   
20 2013-12-06  45.750000  45.799999  44.540001  44.950001  44.950001   
21 2013-12-09  45.590000  49.840000  45.020000  49.139999  49.139999   
22 2013-12-10  48.900002  52.580002  48.700001  51.990002  51.990002   
23 2013-12-11  52.400002  53.869999  51.000000  52.340000  52.340000   
24 2013-12-12  52.200001  55.869999  50.689999  55.330002  55.330002   
25 2013-12-13  56.200001  59.410000  55.450001  59.000000  59.000000   
26 2013-12-16  57.860001  60.240002  55.759998  56.610001  56.610001   
27 2013-12-17  56.970001  57.380001  54.619999  56.450001  56.450001   
28 2013-12-18  57.000000  57.000000  54.230000  55.509998  55.509998   
29 2013-12-19  55.080002  57.750000  55.000000  57.490002  57.490002   
30 2013-12-20  58.509998  60.250000  58.009998  60.009998  60.009998   
31 2013-12-23  59.849998  64.989998  59.700001  64.540001  64.540001   
32 2013-12-24  66.339996  70.870003  65.559998  69.959999  69.959999   
33 2013-12-26  72.879997  74.730003  69.130096  73.309998  73.309998   
34 2013-12-27  70.099998  71.250000  63.689999  63.750000  63.750000   
35 2013-12-30  60.270000  63.709999  58.570000  60.509998  60.509998   
36 2013-12-31  62.360001  65.220001  61.650002  63.650002  63.650002   
37 2014-01-02  65.000000  67.500000  64.400002  67.500000  67.500000   
38 2014-01-03  69.000000  70.430000  68.431999  69.000000  69.000000   
39 2014-01-06  64.830002  66.870003  63.500000  66.290001  66.290001   
40 2014-01-07  67.669998  67.730003  61.389999  61.459999  61.459999   
41 2014-01-08  58.709999  61.259998  57.919998  59.290001  59.290001   
42 2014-01-09  59.540001  60.810001  55.590000  57.049999  57.049999   
43 2014-01-10  57.500000  58.759998  55.869999  57.000000  57.000000   
44 2014-01-13  59.980000  60.380001  57.293999  57.820000  57.820000   
45 2014-01-14  58.880001  59.020000  57.360001  58.209999  58.209999   
46 2014-01-15  59.110001  61.750000  58.320000  61.570000  61.570000   
47 2014-01-16  61.450001  62.400002  60.459999  60.570000  60.570000   
48 2014-01-17  63.599998  64.690002  61.590000  62.200001  62.200001   
49 2014-01-21  63.330002  63.439999  61.500000  62.529999  62.529999   

         Volume  
0   117701670.0  
1    27925307.0  
2    16113941.0  
3     6316755.0  
4     8688325.0  
5    11099433.0  
6     8010663.0  
7    12810624.0  
8     7436616.0  
9     5767325.0  
10    8324753.0  
11    6185245.0  
12   14333375.0  
13    9828433.0  
14    5536322.0  
15    4107074.0  
16    6427386.0  
17    5776893.0  
18   11028953.0  
19   11813520.0  
20    6236232.0  
21   17366614.0  
22   25792002.0  
23   26631535.0  
24   23446870.0  
25   38979567.0  
26   39310848.0  
27   22115199.0  
28   16659776.0  
29   13174896.0  
30   26207420.0  
31   22163787.0  
32   35802698.0  
33   82761072.0  
34   60418668.0  
35   55538253.0  
36   27858516.0  
37   29286655.0  
38   33254610.0  
39   27303649.0  
40   31806111.0  
41   27304350.0  
42   31121971.0  
43   22391578.0  
44   21039027.0  
45   14810026.0  
46   21646397.0  
47   16755251.0  
48   28440701.0  
49   13739691.0  
In [19]:
# 11)	Show the Twitter stock prices over the years  and give a conclusion.
import plotly.graph_objects as go   
import pandas as pd
figure=go.Figure(data = [go.Candlestick(x=data['Date'],
                                        open=data['Open'],
                                        high=data['High'],
                                        low=data['Low'],
                                        close=data['Close'])])

#figure.update_layout(title="Twitter stock prices over the years ",xaxis_rangeslider_visible=False)
figure.show()
In [20]:
#12)Now compare the close vs date column for Twitter prices over the years.
import plotly.express as px
figure=px.bar(data,
             x='Date',
             y='Close',
            color='Close' )
figure.update_xaxes(rangeslider_visible=True)
figure.show()
In [21]:
# 13)	Visualizing the missing values With the help of a heatmap

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(16,10))

sns.heatmap(data.isnull(),cbar=False,cmap="YlGnBu")
plt.show()
In [22]:
import pandas as pd

dt=pd.read_excel('twtr.xlsx')

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(16,10))
sns.heatmap(dt.isnull(),cbar=True,cmap='YlGnBu')
plt.show()
In [23]:
# #14)Assign buttons to control time periods.  Add the  buttons to analyze the stock prices of Twitter in different time periods:



figure=px.bar(data,x='Date',y='Close',color='Close')
figure.update_xaxes(rangeslider_visible=True)

figure.update_layout(title='Twitter stock prices over the years',xaxis_rangeslider_visible=False)

figure.update_xaxes(

rangeselector=dict(
buttons=list([

dict(count=1,label='1m',step='month',stepmode='backward'),
dict(count=3,label='3m',step='month',stepmode='backward'),
dict(count=6,label='6m',step='month',stepmode='backward'),
dict(count=1,label='1y',step='year',stepmode='backward'),
dict(count=2,label='2y',step='year',stepmode='backward'),
dict(step='all'),

])
)
)

figure.show()
In [24]:
# 15)	Give the complete timeline of Twitter in the stock market. (Line Graph)
data['Date']=pd.to_datetime(data['Date'],format='%Y-%m-%d')
data['Year']=data['Date'].dt.year
data['Month']=data['Date'].dt.month

fig=px.line(data,
           x='Month',
           y='Close',
           color='Year',
           title='Complete timeline pf twitter')
fig.show()
In [25]:
import plotly.figure_factory as ff
import numpy as np

x=np.random.rand(15,20)

fig=ff.create_dendrogram(x)
fig.update_layout({'plot_bgcolor':'white'})

fig.show()
In [ ]:
 
In [ ]: